neonati<-read.csv("neonati.csv",stringsAsFactors = T,sep=",")
summary(neonati)
##    Anni.madre     N.gravidanze       Fumatrici        Gestazione   
##  Min.   : 0.00   Min.   : 0.0000   Min.   :0.0000   Min.   :25.00  
##  1st Qu.:25.00   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:38.00  
##  Median :28.00   Median : 1.0000   Median :0.0000   Median :39.00  
##  Mean   :28.16   Mean   : 0.9812   Mean   :0.0416   Mean   :38.98  
##  3rd Qu.:32.00   3rd Qu.: 1.0000   3rd Qu.:0.0000   3rd Qu.:40.00  
##  Max.   :46.00   Max.   :12.0000   Max.   :1.0000   Max.   :43.00  
##       Peso        Lunghezza         Cranio    Tipo.parto Ospedale   Sesso   
##  Min.   : 830   Min.   :310.0   Min.   :235   Ces: 728   osp1:816   F:1256  
##  1st Qu.:2990   1st Qu.:480.0   1st Qu.:330   Nat:1772   osp2:849   M:1244  
##  Median :3300   Median :500.0   Median :340              osp3:835           
##  Mean   :3284   Mean   :494.7   Mean   :340                                 
##  3rd Qu.:3620   3rd Qu.:510.0   3rd Qu.:350                                 
##  Max.   :4930   Max.   :565.0   Max.   :390

The dataset contains the following variables:

Let’s visualize now this variables

library(DescTools)
## Warning: package 'DescTools' was built under R version 4.3.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
ggplot(neonati, aes(x = as.factor(Anni.madre))) + 
  geom_bar() +
  labs(title = "Distribution of years of mother", 
       x = "Year mother", 
       y = "Count") +
  theme_minimal()

The lowest values 0 and 1 years old are clearly wrong data and it is wise to remove them from the dataset. Looking qualitatively at the histogram, this variable seems to have a gaussian-shape and the most frequent values equal to 27 and 30 years old

ggplot(neonati, aes(x = as.factor(N.gravidanze))) + 
  geom_bar() +
  labs(title = "Distribution of Number of Pregnancies", 
       x = "Number of Pregnancies", 
       y = "Count") +
  theme_minimal()

From the histogram plot it is clear that the largest sub group is represented by women approaching to the first delivery. Then, generally the number of women decreases as the number of pregnancies increases.

ggplot(neonati, aes(x = as.factor(Gestazione))) + 
  geom_bar() +
  labs(title = "Distribution of Pregnancy period", 
       x = "Number of Pregnancie Period (in weeks)", 
       y = "Count") +
  theme_minimal()

We can see that 40 is the most frequent number of weeks for the last of the pregnancy and the largest part of women delivers close to this number of weeks

ggplot(neonati, aes(x = Peso)) + 
  geom_histogram(aes(y = after_stat(density)), binwidth = 90, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.2, fill = "red") +
  labs(title = "Distribution of Newborn Weight", 
       x = "Newborn Weight (grams)", 
       y = "Count") +
  theme_minimal()

ggplot(neonati, aes(x = Lunghezza)) + 
  geom_histogram(aes(y = after_stat(density)), binwidth = 10, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.2, fill = "red") +
  labs(title = "Distribution of Newborn Length", 
       x = "Newborn Length (in millimiters)", 
       y = "Count") +
  theme_minimal()

ggplot(neonati, aes(x = Cranio)) + 
  geom_histogram(aes(y = after_stat(density)), binwidth = 5, fill = "lightblue", color = "black") +
  geom_density(alpha = 0.2, fill = "red") +
  labs(title = "Distribution of Head Length", 
       x = "Head Length (in millimiters)", 
       y = "Count") +
  theme_minimal()

Peso, Lunghezza and Cranio show a gaussian-like distribution but with a longer left tail (left-skewed). So for Peso in particular we need to take care about the hypotesis of normality.

Desc(neonati$Tipo.parto, plotit=TRUE)
## ────────────────────────────────────────────────────────────────────────────── 
## neonati$Tipo.parto (factor - dichotomous)
## 
##   length      n    NAs unique
##    2'500  2'500      0      2
##          100.0%   0.0%       
## 
##       freq   perc  lci.95  uci.95'
## Ces    728  29.1%   27.4%   30.9%
## Nat  1'772  70.9%   69.1%   72.6%
## 
## ' 95%-CI (Wilson)

Tipo.parto shows that most women undergo a natural delivery.

Desc(neonati$Ospedale, plotit=TRUE)
## ────────────────────────────────────────────────────────────────────────────── 
## neonati$Ospedale (factor)
## 
##   length      n    NAs unique levels  dupes
##    2'500  2'500      0      3      3      y
##          100.0%   0.0%                     
## 
##    level  freq   perc  cumfreq  cumperc
## 1   osp2   849  34.0%      849    34.0%
## 2   osp3   835  33.4%    1'684    67.4%
## 3   osp1   816  32.6%    2'500   100.0%

Ospedale tell us that the women pregnancies were managed more or less equally from the three hospitals, with the largest number in Osp1.

Desc(neonati$Sesso, plotit=TRUE)
## ────────────────────────────────────────────────────────────────────────────── 
## neonati$Sesso (factor - dichotomous)
## 
##   length      n    NAs unique
##    2'500  2'500      0      2
##          100.0%   0.0%       
## 
##     freq   perc  lci.95  uci.95'
## F  1'256  50.2%   48.3%   52.2%
## M  1'244  49.8%   47.8%   51.7%
## 
## ' 95%-CI (Wilson)

Sex variable shows that female births are slightly larger than male births.

Let’s remove the wrong data corresponding to 0 and 1 years old.

neonati <- subset(neonati, Anni.madre >= 12)

ggplot(neonati, aes(x = as.factor(Anni.madre))) + 
  geom_bar() +
  labs(title = "Distribution of years of mother", 
       x = "Year mother", 
       y = "Count") +
  theme_minimal()

To verify if the mean of Peso and Lunghezza for this sample is significantly equal to the mean of the correspondent population mean, t-test is applied. Considering a Peso population mean of 3300 grams and a Length population mean of 500 millimiters:

peso_population_mean <- 3300
t_test_peso <- t.test(neonati$Peso, mu = peso_population_mean,
       conf.level = 0.95, 
       alternative = "two")
print(t_test_peso)
## 
##  One Sample t-test
## 
## data:  neonati$Peso
## t = -1.505, df = 2497, p-value = 0.1324
## alternative hypothesis: true mean is not equal to 3300
## 95 percent confidence interval:
##  3263.577 3304.791
## sample estimates:
## mean of x 
##  3284.184

The p-value (0.1324) is greater than the typical significance level of 0.05. Therefore, we fail to reject the null hypothesis. This means there is insufficient evidence to conclude that the mean weight of the neonates is different from the population mean of 3300 grams. The 95% confidence interval for the mean weight of the sample is (3263.577, 3304.791). Since this interval includes the population mean of 3300, it suggests that the true mean could plausibly be 3300 grams.

lunghezza_population_mean <- 500
t_test_lunghezza <- t.test(neonati$Lunghezza, mu = lunghezza_population_mean,
       conf.level = 0.95, 
       alternative = "two")
print(t_test_lunghezza)
## 
##  One Sample t-test
## 
## data:  neonati$Lunghezza
## t = -10.069, df = 2497, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 500
## 95 percent confidence interval:
##  493.6628 495.7287
## sample estimates:
## mean of x 
##  494.6958

Since the p-value is far less than 0.05, we reject the null hypothesis. This means there is strong evidence that the true mean length of the neonates is different from the population mean of 500. The 95% confidence interval for the sample mean length is (493.6628, 495.7287), which does not include 500. This further supports the conclusion that the true mean length is statistically different from 500.

Let’s verify now if these and other variables show significative differences by sex.

t_test_sesso_peso <- t.test(Peso ~ Sesso, data = neonati)
print(t_test_sesso_peso)
## 
##  Welch Two Sample t-test
## 
## data:  Peso by Sesso
## t = -12.115, df = 2488.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -287.4841 -207.3844
## sample estimates:
## mean in group F mean in group M 
##        3161.061        3408.496

Since the p-value is much smaller than 0.05, we reject the null hypothesis. This means there is strong evidence that the mean weight of neonates differs significantly between females (F) and males (M). The mean weight in group F (female) is 3161.061 grams, while the mean weight in group M (male) is 3408.496 grams. This difference is substantial, with males weighing more on average.

The visualization of the box plot makes more easy to qualitatively detect this significant difference:

Desc(Peso ~ Sesso, neonati, digits=1, plotit=TRUE)
## ────────────────────────────────────────────────────────────────────────────── 
## Peso ~ Sesso (neonati)
## 
## Summary: 
## n pairs: 2'498, valid: 2'498 (100.0%), missings: 0 (0.0%), groups: 2
## 
##                         
##               F        M
## mean    3'161.1  3'408.5
## median  3'160.0  3'430.0
## sd        526.5    493.9
## IQR       570.0    570.0
## n         1'255    1'243
## np        50.2%    49.8%
## NAs           0        0
## 0s            0        0
## 
## Kruskal-Wallis rank sum test:
##   Kruskal-Wallis chi-squared = 181.01, df = 1, p-value < 2.2e-16

t_test_sesso_lunghezza <- t.test(Lunghezza ~ Sesso, data = neonati)
print(t_test_sesso_lunghezza)
## 
##  Welch Two Sample t-test
## 
## data:  Lunghezza by Sesso
## t = -9.5823, df = 2457.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -11.939001  -7.882672
## sample estimates:
## mean in group F mean in group M 
##        489.7641        499.6750

Since the p-value is much smaller than 0.05, we reject the null hypothesis. So also in this case there is strong evidence that there is a significant difference in mean length between female and male neonates. The mean length in group F (female) is 489.7641 mm, while the mean length in group M (male) is 499.6750 mm. This suggests that male neonates are, on average, longer than female neonates.

This is confirmed by the boxplot:

Desc(Lunghezza ~ Sesso, neonati, digits=1, plotit=TRUE)
## ────────────────────────────────────────────────────────────────────────────── 
## Lunghezza ~ Sesso (neonati)
## 
## Summary: 
## n pairs: 2'498, valid: 2'498 (100.0%), missings: 0 (0.0%), groups: 2
## 
##                     
##             F      M
## mean    489.8  499.7
## median  490.0  500.0
## sd       27.5   24.0
## IQR      25.0   25.0
## n       1'255  1'243
## np      50.2%  49.8%
## NAs         0      0
## 0s          0      0
## 
## Kruskal-Wallis rank sum test:
##   Kruskal-Wallis chi-squared = 108.02, df = 1, p-value < 2.2e-16

t_test_fumo_peso <- t.test(Peso ~ Fumatrici, data = neonati)
print(t_test_fumo_peso)
## 
##  Welch Two Sample t-test
## 
## data:  Peso by Fumatrici
## t = 1.0362, df = 114.12, p-value = 0.3023
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -45.5076 145.3399
## sample estimates:
## mean in group 0 mean in group 1 
##        3286.262        3236.346

There is no statistically significant difference in the weights of neonates based on maternal smoking status. The mean weight of neonates born to non-smokers is slightly higher than that of neonates born to smokers, but the evidence does not support a strong conclusion about this difference

Desc(Peso ~ Fumatrici, neonati, digits=1, plotit=TRUE)
## ────────────────────────────────────────────────────────────────────────────── 
## Peso ~ Fumatrici (neonati)
## 
## Summary: 
## n pairs: 2'498, valid: 2'498 (100.0%), missings: 0 (0.0%), groups: 2
## 
##                         
##               0        1
## mean    3'286.3  3'236.3
## median  3'300.0  3'175.0
## sd        527.1    478.8
## IQR       620.0    500.0
## n         2'394      104
## np        95.8%     4.2%
## NAs           0        0
## 0s            0        0
## 
## Kruskal-Wallis rank sum test:
##   Kruskal-Wallis chi-squared = 3.5576, df = 1, p-value = 0.05927

(z <- Desc(Peso ~ Fumatrici, neonati, test=t.test, digitd=1, plotit=FALSE))
## ────────────────────────────────────────────────────────────────────────────── 
## Peso ~ Fumatrici (neonati)
## 
## Summary: 
## n pairs: 2'498, valid: 2'498 (100.0%), missings: 0 (0.0%), groups: 2
## 
##                         
##               0        1
## mean    3'286.3  3'236.3
## median  3'300.0  3'175.0
## sd        527.1    478.8
## IQR       620.0    500.0
## n         2'394      104
## np        95.8%     4.2%
## NAs           0        0
## 0s            0        0
## 
## Welch Two Sample t-test:
##   t = 1.0362, df = 114.12, p-value = 0.3023
plot(z, type="dens")

To check if in some hospitals caesarean births occur significantly more often than natural births let’s perform a Chi-squared test of independence: it allows to assess whether there is a significant association between the two categorical variables: type of delivery (Tipo.parto) and the hospital (Ospedale) in which the delivery occurred

contingenza <- table(neonati$Tipo.parto, neonati$Ospedale)
chi_test <- chisq.test(contingenza)
print(chi_test)
## 
##  Pearson's Chi-squared test
## 
## data:  contingenza
## X-squared = 1.083, df = 2, p-value = 0.5819

X-squared of 1.083 is the value of the Chi-squared statistic. It measures the discrepancy between the observed frequencies (actual counts in the contingency table) and the expected frequencies (counts we would expect if there were no association between the variables). Since the p-value (0.5819) is much greater than the common significance level of 0.05, we fail to reject the null hypothesis. This suggests that there is no statistically significant association between the type of delivery and the hospital where the delivery occurred.

Desc(Tipo.parto ~ Ospedale, neonati, digits=1,plotit=TRUE)
## ────────────────────────────────────────────────────────────────────────────── 
## Tipo.parto ~ Ospedale (neonati)
## 
## Summary: 
## n: 2'498, rows: 3, columns: 2
## 
## Pearson's Chi-squared test:
##   X-squared = 1.083, df = 2, p-value = 0.5819
## Log likelihood ratio (G-test) test of independence:
##   G = 1.0877, X-squared df = 2, p-value = 0.5805
## Mantel-Haenszel Chi-squared:
##   X-squared = 0.68196, df = 1, p-value = 0.4089
## 
## Contingency Coeff.     0.021
## Cramer's V             0.021
## Kendall Tau-b          0.016
## 
##                                           
##            Tipo.parto    Ces    Nat    Sum
## Ospedale                                  
##                                           
## osp1       freq          242    574    816
##            perc         9.7%  23.0%  32.7%
##            p.row       29.7%  70.3%      .
##            p.col       33.2%  32.4%      .
##                                           
## osp2       freq          254    594    848
##            perc        10.2%  23.8%  33.9%
##            p.row       30.0%  70.0%      .
##            p.col       34.9%  33.6%      .
##                                           
## osp3       freq          232    602    834
##            perc         9.3%  24.1%  33.4%
##            p.row       27.8%  72.2%      .
##            p.col       31.9%  34.0%      .
##                                           
## Sum        freq          728  1'770  2'498
##            perc        29.1%  70.9% 100.0%
##            p.row           .      .      .
##            p.col           .      .      .
## 

Moreover also newborn weight seems not depending on the kind of hospital:

boxplot(neonati$Peso ~ neonati$Ospedale, 
                    xlab = "Ospedale", 
                    ylab = "Peso (g)", 
                    main = "Peso del Neonato per Ospedale",
                    col = terrain.colors(3))

mylevels <- levels(neonati$Ospedale)
levelProportions <- summary(neonati$Ospedale) / nrow(neonati[, c('Ospedale','Peso')])

for (i in 1:length(mylevels)) {

  thisvalues <- neonati[neonati$Ospedale == mylevels[i], "Peso"]
  myjitter <- jitter(rep(i, length(thisvalues)), amount = levelProportions[i] / 2)
  points(myjitter, thisvalues, pch = 20, cex= .5, col = rgb(0, 0, 0, .9)) 
}

t_test_tipo_parto_peso <- t.test(Peso ~ Tipo.parto, data = neonati)
print(t_test_tipo_parto_peso)
## 
##  Welch Two Sample t-test
## 
## data:  Peso by Tipo.parto
## t = -0.13626, df = 1494.4, p-value = 0.8916
## alternative hypothesis: true difference in means between group Ces and group Nat is not equal to 0
## 95 percent confidence interval:
##  -46.44246  40.40931
## sample estimates:
## mean in group Ces mean in group Nat 
##          3282.047          3285.063

Also in this case, there is no statistically significant difference in the weights of neonates based on the typo of birth.

boxplot(neonati$Peso ~ neonati$Tipo.parto, 
                    xlab = "Tipo di Parto", 
                    ylab = "Peso (g)", 
                    main = "Peso del Neonato per Tipo di Parto",
                    col = terrain.colors(3))

mylevels <- levels(neonati$Tipo.parto)
levelProportions <- summary(neonati$Tipo.parto) / nrow(neonati[, c('Tipo.parto','Peso')])

for (i in 1:length(mylevels)) {

  thisvalues <- neonati[neonati$Tipo.parto == mylevels[i], "Peso"]
  myjitter <- jitter(rep(i, length(thisvalues)), amount = levelProportions[i] / 2)
  points(myjitter, thisvalues, pch = 20, cex= .5, col = rgb(0, 0, 0, .9)) 
}

Let’s now look at the correlations inside the dataset.

library(DescTools)
par(mfrow=c(1,1))
m <- cor(neonati[,which(sapply(neonati, is.numeric))], use="pairwise.complete.obs")
col_palette <- colorRampPalette(c("red", "white", "blue"))(100)
PlotCorr(m, col=col_palette, border="grey",
 args.colorlegend=list(labels=Format(seq(-1,1,.25), 2), frame="grey"))

PlotWeb(m, col=c(hred, hblue))

Lunghezza and Cranio are highly correlated with Peso and will be essential in the predictive model. They provide significant information on the baby’s overall growth, making them strong predictors of weight. Gestazione is another key predictor: longer gestation times lead to heavier babies, so including gestation as a variable will improve the model’s accuracy. Anni.madre, N.gravidanze, and Fumatrici show weak correlations with Peso, implying that they may not add much predictive power.

panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- cor(x, y, use = "complete.obs")
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor *1.3)
}

pairs(neonati[, c("Peso", "Cranio", "Gestazione", "Lunghezza")],lower.panel=panel.cor, upper.panel=panel.smooth, pch = 19, cex.labels = 1.3)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter

shapiro.test(neonati$Peso)
## 
##  Shapiro-Wilk normality test
## 
## data:  neonati$Peso
## W = 0.97068, p-value < 2.2e-16
moments::skewness(neonati$Peso)
## [1] -0.6474036
moments::kurtosis(neonati$Peso)-3
## [1] 2.028753

The Shapiro-Wilk normality test strongly suggests that the Peso variable does not follow a normal distribution. The very low p-value indicates a statistically significant deviation from normality. Such deviations from normality of the response variable most of the time also affect the residuals after calculating the model. Hence, the most critical thing to check next is whether the residuals of our model are normally distributed.

A skewness of -0.647 is moderately negative but not extreme, suggesting some asymmetry in the distribution. Excess kurtosis of -0.9712 indicates the distribution has light tails and is slightly platykurtic. In other words, the distribution is flatter or less peaked than a normal distribution, with fewer outliers in the tails.

Therefore Shapiro-Wilk test, combined with the skewness and kurtosis values, confirms that the Peso variable is not normally distributed. This needs to be taken into consideration for the further multidimensional analysis and searching of the ‘best’ predicting model for newborn weight.

attach(neonati)
mod<-lm(Peso~Gestazione+Lunghezza+Cranio+Fumatrici+N.gravidanze+Anni.madre+Tipo.parto+Ospedale+Sesso)
summary(mod)
## 
## Call:
## lm(formula = Peso ~ Gestazione + Lunghezza + Cranio + Fumatrici + 
##     N.gravidanze + Anni.madre + Tipo.parto + Ospedale + Sesso)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1123.26  -181.53   -14.45   161.05  2611.89 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6735.7960   141.4790 -47.610  < 2e-16 ***
## Gestazione       32.5773     3.8208   8.526  < 2e-16 ***
## Lunghezza        10.2922     0.3009  34.207  < 2e-16 ***
## Cranio           10.4722     0.4263  24.567  < 2e-16 ***
## Fumatrici       -30.2741    27.5492  -1.099   0.2719    
## N.gravidanze     11.3812     4.6686   2.438   0.0148 *  
## Anni.madre        0.8018     1.1467   0.699   0.4845    
## Tipo.partoNat    29.6335    12.0905   2.451   0.0143 *  
## Ospedaleosp2    -11.0912    13.4471  -0.825   0.4096    
## Ospedaleosp3     28.2495    13.5054   2.092   0.0366 *  
## SessoM           77.5723    11.1865   6.934 5.18e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274 on 2487 degrees of freedom
## Multiple R-squared:  0.7289, Adjusted R-squared:  0.7278 
## F-statistic: 668.7 on 10 and 2487 DF,  p-value: < 2.2e-16

Significant Predictors:

Non-Significant Predictors:

Multiple R-squared (0.7289) and Adjusted R-squared (0.7278) indicate that about 72.89% of the variability in birth weight is explained by the model. This is quite a strong result, meaning the predictors collectively do a good job of explaining the variance in birth weight. The very low p-value indicates that the model as a whole is statistically significant. The significant predictors, such as gestational age, length, head circumference, number of pregnancies, type of birth, hospital, and sex, may be initially kept in the model as they have clear and statistically significant relationships with birth weight. The non-significant predictors (mother’s age, smoking status, and some hospital variables) could potentially be removed to simplify the model. However we can also consider checking for interactions or nonlinear relationships.

mod2 <- update(mod,~.-Fumatrici-Anni.madre-Ospedale)
summary(mod2)
## 
## Call:
## lm(formula = Peso ~ Gestazione + Lunghezza + Cranio + N.gravidanze + 
##     Tipo.parto + Sesso)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1129.14  -181.97   -16.26   160.95  2638.18 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6708.0171   136.0715 -49.298  < 2e-16 ***
## Gestazione       32.3253     3.7969   8.514  < 2e-16 ***
## Lunghezza        10.2833     0.3009  34.177  < 2e-16 ***
## Cranio           10.5063     0.4263  24.648  < 2e-16 ***
## N.gravidanze     12.7356     4.3385   2.935  0.00336 ** 
## Tipo.partoNat    30.1601    12.1027   2.492  0.01277 *  
## SessoM           77.9171    11.1994   6.957 4.42e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.4 on 2491 degrees of freedom
## Multiple R-squared:  0.7277, Adjusted R-squared:  0.727 
## F-statistic:  1109 on 6 and 2491 DF,  p-value: < 2.2e-16

By removing Fumatrici, Anni.madre, and Ospedale, we’ve simplified the model without substantially sacrificing predictive power or fit. The R-squared has only dropped slightly, and the remaining predictors continue to have similar significance and effect sizes. Since Fumatrici and Anni.madre were non-significant in the original model, and only one of the hospital variables (Ospedaleosp3) was marginally significant, their removal makes the model more parsimonious without much loss in explanatory power.

mod3<-update(mod2,~.-N.gravidanze-Tipo.parto)
summary(mod3)
## 
## Call:
## lm(formula = Peso ~ Gestazione + Lunghezza + Cranio + Sesso)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1138.1  -184.4   -17.4   163.3  2626.7 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6651.6732   135.5952 -49.055  < 2e-16 ***
## Gestazione     31.3262     3.7884   8.269  < 2e-16 ***
## Lunghezza      10.2024     0.3009  33.909  < 2e-16 ***
## Cranio         10.6706     0.4247  25.126  < 2e-16 ***
## SessoM         79.1027    11.2205   7.050 2.31e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 275.1 on 2493 degrees of freedom
## Multiple R-squared:  0.7261, Adjusted R-squared:  0.7257 
## F-statistic:  1652 on 4 and 2493 DF,  p-value: < 2.2e-16

Impact of Removing N.gravidanze and Tipo.parto: While both variables were significant in mod2, their removal has had minimal effect on the performance metrics of the model. The coefficients for the remaining predictors have remained relatively stable, and the R-squared has only dropped slightly. This suggests that the bulk of the predictive power is captured by the four remaining variables (gestation, length, head circumference, and sex). By removing these two variables, we have simplified the model further, reducing the number of predictors, while sacrificing only a small amount of predictive accuracy. The slightly increased residual standard error suggests a small increase in prediction error, but the simplicity gained by reducing the model complexity may outweigh this trade-off.

mod4<-update(mod2,~.-Tipo.parto)
summary(mod4)
## 
## Call:
## lm(formula = Peso ~ Gestazione + Lunghezza + Cranio + N.gravidanze + 
##     Sesso)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1149.37  -180.98   -15.57   163.69  2639.09 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6681.7251   135.8036 -49.201  < 2e-16 ***
## Gestazione      32.3827     3.8008   8.520  < 2e-16 ***
## Lunghezza       10.2455     0.3008  34.059  < 2e-16 ***
## Cranio          10.5410     0.4265  24.717  < 2e-16 ***
## N.gravidanze    12.4554     4.3416   2.869  0.00415 ** 
## SessoM          77.9807    11.2111   6.956 4.47e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.7 on 2492 degrees of freedom
## Multiple R-squared:  0.727,  Adjusted R-squared:  0.7265 
## F-statistic:  1327 on 5 and 2492 DF,  p-value: < 2.2e-16

Keeping also N.gravidanze slightly increases residual standard error suggesting a small increase in prediction error. So the choice between mod3 and mod4 is just about how much we want to prefer model simplicity/complexity given an almost identical overall performance.

From the above scatterplot it can appear a non linear relationship between Peso and Lunghezza. Let’s try to add this non linear contribute using a quadratic term:

mod5<-update(mod3,~.+I(Lunghezza^2))
summary(mod5)
## 
## Call:
## lm(formula = Peso ~ Gestazione + Lunghezza + Cranio + Sesso + 
##     I(Lunghezza^2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1156.67  -182.24   -12.53   166.61  1783.50 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    153.596868 725.069894   0.212    0.832    
## Gestazione      41.221786   3.863307  10.670  < 2e-16 ***
## Lunghezza      -19.905661   3.167097  -6.285 3.85e-10 ***
## Cranio          10.796724   0.417414  25.866  < 2e-16 ***
## SessoM          71.343224  11.052836   6.455 1.30e-10 ***
## I(Lunghezza^2)   0.031230   0.003271   9.548  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 270.3 on 2492 degrees of freedom
## Multiple R-squared:  0.7358, Adjusted R-squared:  0.7352 
## F-statistic:  1388 on 5 and 2492 DF,  p-value: < 2.2e-16

The increment in accuracy from adding the quadratic term is statistically significant but the change in predictive accuracy (a 1% increase in R-squared and a 4.8-gram reduction in residual error) is relatively modest. If we prioritize simplicity and ease of interpretation, then sticking with the simpler model (mod3) without the quadratic term is a reasonable choice. The simpler model still explains around 72.6% of the variance in birth weight and provides interpretable coefficients with minimal complexity.

Let’s try to add for example the effect of interaction between Lunghezza and Cranio

mod6<-lm(Peso~Lunghezza*Cranio)
summary(mod6)
## 
## Call:
## lm(formula = Peso ~ Lunghezza * Cranio)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1237.23  -181.87   -13.29   165.47  2924.05 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -3.754e+03  1.013e+03  -3.705 0.000216 ***
## Lunghezza         6.316e+00  2.110e+00   2.993 0.002785 ** 
## Cranio            3.477e+00  3.106e+00   1.119 0.263039    
## Lunghezza:Cranio  1.621e-02  6.385e-03   2.539 0.011177 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 281.2 on 2494 degrees of freedom
## Multiple R-squared:  0.7137, Adjusted R-squared:  0.7133 
## F-statistic:  2072 on 3 and 2494 DF,  p-value: < 2.2e-16

It seems that the interaction term between Lunghezza and Cranio does not provide a substantial improvement to the model’s performance, and may even slightly harm it. Therefore, mod3 (the simpler model) is likely preferable in this case. Verifying also other possible interactions is seems not beneficial: for this case interactions just complicate the model without offering significant gains in predictive performance.

anova(mod3,mod6)
## Analysis of Variance Table
## 
## Model 1: Peso ~ Gestazione + Lunghezza + Cranio + Sesso
## Model 2: Peso ~ Lunghezza * Cranio
##   Res.Df       RSS Df Sum of Sq      F    Pr(>F)    
## 1   2493 188663107                                  
## 2   2494 197233092 -1  -8569985 113.24 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Anova confirms that the interaction is statistically significant, but it does not improve the model’s fit sufficiently to justify its inclusion. mod3, the simpler model without the interaction term, is preferred based on the overall model performance and its ability to explain the variability in Peso

n <- nrow(neonati)
stepwise.mod <- MASS::stepAIC(mod,direction="both",k=log(n))
## Start:  AIC=28118.61
## Peso ~ Gestazione + Lunghezza + Cranio + Fumatrici + N.gravidanze + 
##     Anni.madre + Tipo.parto + Ospedale + Sesso
## 
##                Df Sum of Sq       RSS   AIC
## - Anni.madre    1     36710 186779904 28111
## - Fumatrici     1     90677 186833870 28112
## - Ospedale      2    687555 187430749 28112
## - N.gravidanze  1    446244 187189438 28117
## - Tipo.parto    1    451073 187194266 28117
## <none>                      186743194 28119
## - Sesso         1   3610705 190353899 28159
## - Gestazione    1   5458852 192202046 28183
## - Cranio        1  45318506 232061700 28654
## - Lunghezza     1  87861708 274604902 29074
## 
## Step:  AIC=28111.28
## Peso ~ Gestazione + Lunghezza + Cranio + Fumatrici + N.gravidanze + 
##     Tipo.parto + Ospedale + Sesso
## 
##                Df Sum of Sq       RSS   AIC
## - Fumatrici     1     91599 186871503 28105
## - Ospedale      2    693914 187473818 28105
## - Tipo.parto    1    452049 187231953 28110
## <none>                      186779904 28111
## - N.gravidanze  1    631082 187410986 28112
## + Anni.madre    1     36710 186743194 28119
## - Sesso         1   3617809 190397713 28151
## - Gestazione    1   5424800 192204704 28175
## - Cranio        1  45569477 232349381 28649
## - Lunghezza     1  87852027 274631931 29066
## 
## Step:  AIC=28104.68
## Peso ~ Gestazione + Lunghezza + Cranio + N.gravidanze + Tipo.parto + 
##     Ospedale + Sesso
## 
##                Df Sum of Sq       RSS   AIC
## - Ospedale      2    702925 187574428 28098
## - Tipo.parto    1    444404 187315907 28103
## <none>                      186871503 28105
## - N.gravidanze  1    608136 187479640 28105
## + Fumatrici     1     91599 186779904 28111
## + Anni.madre    1     37633 186833870 28112
## - Sesso         1   3601860 190473363 28145
## - Gestazione    1   5358199 192229702 28168
## - Cranio        1  45613331 232484834 28642
## - Lunghezza     1  88259386 275130889 29063
## 
## Step:  AIC=28098.41
## Peso ~ Gestazione + Lunghezza + Cranio + N.gravidanze + Tipo.parto + 
##     Sesso
## 
##                Df Sum of Sq       RSS   AIC
## - Tipo.parto    1    467626 188042054 28097
## <none>                      187574428 28098
## - N.gravidanze  1    648873 188223301 28099
## + Ospedale      2    702925 186871503 28105
## + Fumatrici     1    100610 187473818 28105
## + Anni.madre    1     44184 187530244 28106
## - Sesso         1   3644818 191219246 28139
## - Gestazione    1   5457887 193032315 28162
## - Cranio        1  45747094 233321522 28636
## - Lunghezza     1  87955701 275530129 29051
## 
## Step:  AIC=28096.81
## Peso ~ Gestazione + Lunghezza + Cranio + N.gravidanze + Sesso
## 
##                Df Sum of Sq       RSS   AIC
## <none>                      188042054 28097
## - N.gravidanze  1    621053 188663107 28097
## + Tipo.parto    1    467626 187574428 28098
## + Ospedale      2    726146 187315907 28103
## + Fumatrici     1     92548 187949505 28103
## + Anni.madre    1     45366 187996688 28104
## - Sesso         1   3650790 191692844 28137
## - Gestazione    1   5477493 193519547 28161
## - Cranio        1  46098547 234140601 28637
## - Lunghezza     1  87532691 275574744 29044
summary(stepwise.mod)
## 
## Call:
## lm(formula = Peso ~ Gestazione + Lunghezza + Cranio + N.gravidanze + 
##     Sesso)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1149.37  -180.98   -15.57   163.69  2639.09 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6681.7251   135.8036 -49.201  < 2e-16 ***
## Gestazione      32.3827     3.8008   8.520  < 2e-16 ***
## Lunghezza       10.2455     0.3008  34.059  < 2e-16 ***
## Cranio          10.5410     0.4265  24.717  < 2e-16 ***
## N.gravidanze    12.4554     4.3416   2.869  0.00415 ** 
## SessoM          77.9807    11.2111   6.956 4.47e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.7 on 2492 degrees of freedom
## Multiple R-squared:  0.727,  Adjusted R-squared:  0.7265 
## F-statistic:  1327 on 5 and 2492 DF,  p-value: < 2.2e-16

MASS stepAIC procedure chooses what we called mod4.

MASS library offers also the possibility to use the Box-Cox transformation, a method that can help to identify a possible optimal transformation of the response variable (in this case, Peso) that minimizes residual sum of squares (RSS) and improves model fit.

library(MASS)
boxcox(mod3) 

The plot shows a quite broad peak corresponding to lambda spanning from 0 to 1. So we can try a log-transformation of Peso (corresponding to lambda = 0) or a square root transformation of Peso (corresponding to lambda = 0.5).

mod7 <- lm(log(Peso) ~ Gestazione + Lunghezza + Cranio + Sesso)
summary(mod7)
## 
## Call:
## lm(formula = log(Peso) ~ Gestazione + Lunghezza + Cranio + Sesso)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42127 -0.05497  0.00098  0.05408  0.83845 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.411e+00  4.275e-02 103.176  < 2e-16 ***
## Gestazione  1.824e-02  1.194e-03  15.272  < 2e-16 ***
## Lunghezza   3.516e-03  9.486e-05  37.066  < 2e-16 ***
## Cranio      3.564e-03  1.339e-04  26.618  < 2e-16 ***
## SessoM      1.842e-02  3.537e-03   5.207 2.08e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08673 on 2493 degrees of freedom
## Multiple R-squared:  0.7765, Adjusted R-squared:  0.7762 
## F-statistic:  2166 on 4 and 2493 DF,  p-value: < 2.2e-16

Compared to mod3, applying log transformation on Peso increases by almost 5% the capacity of the model to explain the variability in birth weight (from 72.89% to 77.62%)

mod8 <- lm(sqrt(Peso) ~ Gestazione + Lunghezza + Cranio + Sesso)
summary(mod8)
## 
## Call:
## lm(formula = sqrt(Peso) ~ Gestazione + Lunghezza + Cranio + Sesso)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.1544  -1.5729  -0.0516   1.4626  23.2524 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -37.288120   1.175934  -31.71  < 2e-16 ***
## Gestazione    0.380943   0.032855   11.60  < 2e-16 ***
## Lunghezza     0.093769   0.002609   35.94  < 2e-16 ***
## Cranio        0.096601   0.003683   26.23  < 2e-16 ***
## SessoM        0.620852   0.097308    6.38  2.1e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.386 on 2493 degrees of freedom
## Multiple R-squared:  0.7565, Adjusted R-squared:  0.7561 
## F-statistic:  1937 on 4 and 2493 DF,  p-value: < 2.2e-16

Compared to mod3, applying square root transformation on Peso increases by about 3% the capacity of the model to explain the variability in birth weight (from 72.89% to 75.61%)

AIC(mod,mod2,mod3,mod4,mod5,mod7,mod8)
##      df      AIC
## mod  12 35145.57
## mod2  8 35148.67
## mod3  6 35159.12
## mod4  7 35152.89
## mod5  7 35071.37
## mod7  6 -5119.14
## mod8  6 11440.05
BIC(mod,mod2,mod3,mod4,mod5,mod7,mod8)
##      df      BIC
## mod  12 35215.45
## mod2  8 35195.25
## mod3  6 35194.06
## mod4  7 35193.65
## mod5  7 35112.13
## mod7  6 -5084.20
## mod8  6 11474.99

mod (with the full set of predictors) has the highest AIC and BIC, indicating overfitting, and suggesting that some predictors may not contribute significantly to improving model performance. mod3 and mod4 (simpler models) both perform similarly in terms of AIC and BIC, but they perform slightly worse than mod5. mod5, the one in which the quadratic term for Lunghezza is added, is the best performing model among those without log transformations, but its AIC and BIC are much higher than mod7 and mod8, implying it fits the data less well. Again, the choice between mod3/mod4 and mod5 is about preference between model simplicity/slightly worse model performance and model complexity/slightly better model performance. mod7 (log transformations of variables) has the lowest AIC and BIC values, both substantially negative, which indicates it offers the best trade-off between model fit and complexity.

library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:DescTools':
## 
##     Recode
car::vif(mod7)
## Gestazione  Lunghezza     Cranio      Sesso 
##   1.654101   2.070582   1.606316   1.038918

The Variance Inflation Factor (VIF) values provided from mod7 (but also for mod3 for example) suggest that multicollinearity is not a major concern for the predictors because all the VIF values are below 5.9 So we can proceed with interpreting the results from the regression model without needing to address collinearity issues.

par(mfrow=c(2,4))
plot(mod3)
mtext("mod 3", side = 3, line = -2, outer = TRUE, cex = 1.5, adj = 0.5)
plot(mod7)
mtext("mod 7", side = 3, line = -19, outer = TRUE, cex = 1.5, adj = 0.5)

Both mod3 and mod7 shows some issues with non-linearity, heteroscedasticity and normality of residuals. This is evident from the curved pattern in the residual plots and the deviation from normality in the Q-Q plot. Some refinements may be needed to fully meet the assumptions of linear regression and maybe make me choose the final model.

par(mfrow=c(1,2))
cook3<-cooks.distance(mod3)
plot(cook3,ylim = c(0,2))
max(cook3)
## [1] 0.9780498
cook7<-cooks.distance(mod7)
plot(cook7,ylim = c(0,2))

max(cook7)
## [1] 1.002634

Both models (mod3 and mod7) have a single influential point with a Cook’s distance around 1, suggesting that this point may have a significant impact on the regression results. We should examine this observation (1549) to understand its influence and consider whether it affects the overall model performance.

neonati[1549, ]
##      Anni.madre N.gravidanze Fumatrici Gestazione Peso Lunghezza Cranio
## 1551         35            1         0         38 4370       315    374
##      Tipo.parto Ospedale Sesso
## 1551        Nat     osp3     F

Comparing this observation with the Peso and Lunghezza distribution, we can notice that for this newborn we have a Lunghezza value of 315mm (one of the lowest values recorded in this dataset, left tail of the distribution) corresponding to a relatively large Peso value of 4370 grams (right tail of Peso distribution). This may explain the ‘bad’ influence of this point on the regression results. Let’s try to remove this observation and check if this action provides an improvement.

neonati_try <- neonati[-c(1549), ]
mod_try<-lm(Peso~Gestazione+Lunghezza+Cranio+Sesso, data=neonati_try)
summary(mod_try)
## 
## Call:
## lm(formula = Peso ~ Gestazione + Lunghezza + Cranio + Sesso, 
##     data = neonati_try)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1153.75  -182.01   -14.88   163.78  1391.18 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6652.115    132.990 -50.020  < 2e-16 ***
## Gestazione     28.533      3.726   7.657 2.69e-14 ***
## Lunghezza      10.841      0.302  35.903  < 2e-16 ***
## Cranio         10.059      0.421  23.893  < 2e-16 ***
## SessoM         79.321     11.005   7.208 7.51e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 269.8 on 2492 degrees of freedom
## Multiple R-squared:  0.7362, Adjusted R-squared:  0.7358 
## F-statistic:  1739 on 4 and 2492 DF,  p-value: < 2.2e-16
mod_log_try<-lm(log(Peso)~Gestazione+Lunghezza+Cranio+Sesso, data=neonati_try)
summary(mod_log_try)
## 
## Call:
## lm(formula = log(Peso) ~ Gestazione + Lunghezza + Cranio + Sesso, 
##     data = neonati_try)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.40551 -0.05444  0.00034  0.05530  0.50123 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.410e+00  4.191e-02 105.246  < 2e-16 ***
## Gestazione  1.735e-02  1.174e-03  14.776  < 2e-16 ***
## Lunghezza   3.720e-03  9.515e-05  39.094  < 2e-16 ***
## Cranio      3.369e-03  1.327e-04  25.392  < 2e-16 ***
## SessoM      1.849e-02  3.468e-03   5.332 1.06e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08502 on 2492 degrees of freedom
## Multiple R-squared:  0.7851, Adjusted R-squared:  0.7848 
## F-statistic:  2276 on 4 and 2492 DF,  p-value: < 2.2e-16

Comparing to mod3 and mod7 Adjusted R-squared, the removal of observation 1549 increases the overall model performance of both models.

par(mfrow=c(2,4))
plot(mod_try)
plot(mod_log_try)

par(mfrow=c(1,2))
cook_try<-cooks.distance(mod_try)
plot(cook_try,ylim = c(0,1))
max(cook_try)
## [1] 0.07668711
cook_log_try<-cooks.distance(mod_log_try)
plot(cook_log_try,ylim = c(0,1))

max(cook_log_try)
## [1] 0.1247184

We still see slight curved patterns in the residual plots and the deviation from normality in the Q-Q plot, but no influential points looking at the Cook’s distance

Let’s now carry out the tests on the residuals and check possible improvements by the removal of that observation

library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(mod3)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod3
## BP = 89.198, df = 4, p-value < 2.2e-16
dwtest(mod3)
## 
##  Durbin-Watson test
## 
## data:  mod3
## DW = 1.9553, p-value = 0.1318
## alternative hypothesis: true autocorrelation is greater than 0
bptest(mod7)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod7
## BP = 167.96, df = 4, p-value < 2.2e-16
dwtest(mod7)
## 
##  Durbin-Watson test
## 
## data:  mod7
## DW = 1.9487, p-value = 0.09966
## alternative hypothesis: true autocorrelation is greater than 0
bptest(mod_try)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod_try
## BP = 9.3293, df = 4, p-value = 0.05338
dwtest(mod_try)
## 
##  Durbin-Watson test
## 
## data:  mod_try
## DW = 1.9555, p-value = 0.1325
## alternative hypothesis: true autocorrelation is greater than 0
bptest(mod_log_try)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod_log_try
## BP = 87.525, df = 4, p-value < 2.2e-16
dwtest(mod_log_try)
## 
##  Durbin-Watson test
## 
## data:  mod_log_try
## DW = 1.9507, p-value = 0.1085
## alternative hypothesis: true autocorrelation is greater than 0

None of the models show strong evidence of autocorrelation, as the Durbin-Watson statistics are close to 2 and the p-values are not significant. Therefore, there seems to be no serious problem with autocorrelation in the residuals of these models Insted, while mod3, mod7, and mod_log_try show strong evidence of heteroscedasticity, mod_try is the only one that shows a mild indication of it (p-value = 0.05338). So mod_try is the model less close to a variance variability of the residuals (which violates one of the assumptions of linear regression).

shapiro.test(mod3$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mod3$residuals
## W = 0.97426, p-value < 2.2e-16
plot(density(residuals(mod3)))

shapiro.test(mod3$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mod3$residuals
## W = 0.97426, p-value < 2.2e-16
plot(density(residuals(mod7)))

par(mfrow=c(1,2))
shapiro.test(mod_try$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mod_try$residuals
## W = 0.98868, p-value = 3.6e-13
plot(density(residuals(mod_try)))
shapiro.test(mod_log_try$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mod_log_try$residuals
## W = 0.99026, p-value = 5.548e-12
plot(density(residuals(mod_log_try)))

For all models, the residuals are not normally distributed based on the Shapiro-Wilk test. The p-values are all well below 0.05, which suggests that the assumption of normality is violated in each case. mod_log_try and mod_try show residuals that are closer to normal (even though both show significant departures from normality), with respect to mod3 and mod7 that have the most pronounced departures from normality, with a very low W statistic and a p-value extremely close to zero.

The further removal of other two possible observations that from the QQ plots appear ‘far’ from the line corresponding normality (1306 and 155) does not change the overall model perfomances of mod_try and mod_log_try, even though improves the Shapiro-Wilk test (not enough to pass the residuals normality assumption)

neonati_try_2 <- neonati_try[-c(1306,155), ]
mod_try_2<-lm(Peso~Gestazione+Lunghezza+Cranio+Sesso, data=neonati_try_2)
summary(mod_try_2)
## 
## Call:
## lm(formula = Peso ~ Gestazione + Lunghezza + Cranio + Sesso, 
##     data = neonati_try_2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1155.01  -181.09   -14.37   163.36  1311.36 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6678.1855   132.4717 -50.412  < 2e-16 ***
## Gestazione     28.4511     3.7093   7.670 2.45e-14 ***
## Lunghezza      10.9496     0.3012  36.353  < 2e-16 ***
## Cranio          9.9887     0.4192  23.831  < 2e-16 ***
## SessoM         77.4488    10.9583   7.068 2.04e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 268.5 on 2490 degrees of freedom
## Multiple R-squared:  0.739,  Adjusted R-squared:  0.7385 
## F-statistic:  1762 on 4 and 2490 DF,  p-value: < 2.2e-16
mod_log_try_2<-lm(log(Peso)~Gestazione+Lunghezza+Cranio+Sesso, data=neonati_try_2)
summary(mod_log_try_2)
## 
## Call:
## lm(formula = log(Peso) ~ Gestazione + Lunghezza + Cranio + Sesso, 
##     data = neonati_try_2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.40144 -0.05462  0.00044  0.05552  0.38559 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.401e+00  4.167e-02 105.608  < 2e-16 ***
## Gestazione  1.733e-02  1.167e-03  14.852  < 2e-16 ***
## Lunghezza   3.759e-03  9.475e-05  39.669  < 2e-16 ***
## Cranio      3.344e-03  1.319e-04  25.360  < 2e-16 ***
## SessoM      1.779e-02  3.447e-03   5.162 2.64e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08445 on 2490 degrees of freedom
## Multiple R-squared:  0.7881, Adjusted R-squared:  0.7878 
## F-statistic:  2315 on 4 and 2490 DF,  p-value: < 2.2e-16
par(mfrow=c(2,4))
plot(mod_try_2)
plot(mod_log_try_2)

bptest(mod_try_2)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod_try_2
## BP = 5.9057, df = 4, p-value = 0.2063
dwtest(mod_try_2)
## 
##  Durbin-Watson test
## 
## data:  mod_try_2
## DW = 1.9569, p-value = 0.1406
## alternative hypothesis: true autocorrelation is greater than 0
bptest(mod_log_try_2)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod_log_try_2
## BP = 73.843, df = 4, p-value = 3.5e-15
dwtest(mod_log_try_2)
## 
##  Durbin-Watson test
## 
## data:  mod_log_try_2
## DW = 1.9512, p-value = 0.1114
## alternative hypothesis: true autocorrelation is greater than 0
par(mfrow=c(1,2))
shapiro.test(mod_try_2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mod_try_2$residuals
## W = 0.99058, p-value = 1.018e-11
plot(density(residuals(mod_try_2)))
shapiro.test(mod_log_try_2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mod_log_try_2$residuals
## W = 0.99296, p-value = 1.277e-09
plot(density(residuals(mod_log_try_2)))

After all these checks and the general statistical analysis, I would choose the model “mod_try_2” has the best predictive model for the neonati dataset. This model has the 73.85% of capability to explain the variability in birth weight and it was crucial for my choice the removal of the 1594th observation that makes this model meet (although weakly) homoscedasticity. The further removal of 1306th and 155th observations was just a refinement to better improve (even tough not enough) the residuals normality assumption.

Let’s now use this predictive model to provide a value for the birth weight of a newborn, given N.Gravidanze equal to 3 (but this value will not be used in our model since that variable was discarded as not impactful), Gestazione equal to 39 and female Sesso. No measures are provided by the ecography, so I will use Lunghezza and Cranio mean values from neonati_try_2 dataset.

mean_lunghezza <- mean(neonati_try_2$Lunghezza, na.rm = TRUE)
mean_cranio <- mean(neonati_try_2$Cranio, na.rm = TRUE)

nuova_neonata <- data.frame(Gestazione = 39, Lunghezza = mean_lunghezza, Cranio = mean_cranio, Sesso = "F")
predizione_peso <- predict(mod_try_2, newdata = nuova_neonata)

predizione_peso
##        1 
## 3245.524

The newborn weight is predicted to be 3245.524 grams.

Let’s now try to create some spatial visualizations of our predictive model.

library(plotly)
## Warning: package 'plotly' was built under R version 4.3.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
neonati_try_2$predicted_peso <- predict(mod_try_2, neonati_try_2)

plot_ly(neonati_try_2, x = ~Lunghezza, y = ~Cranio, z = ~predicted_peso, 
        type = "scatter3d", mode = "markers", color = ~predicted_peso) %>%
  layout(scene = list(xaxis = list(title = 'Lunghezza'),
                      yaxis = list(title = 'Cranio'),
                      zaxis = list(title = 'Peso Predetto')))
plot_ly(neonati_try_2, x = ~Lunghezza, y = ~Gestazione, z = ~predicted_peso, 
        type = "scatter3d", mode = "markers", color = ~predicted_peso) %>%
  layout(scene = list(xaxis = list(title = 'Lunghezza'),
                      yaxis = list(title = 'Gestazione'),
                      zaxis = list(title = 'Peso Predetto')))
plot_ly(neonati_try_2, x = ~Cranio, y = ~Gestazione, z = ~predicted_peso, 
        type = "scatter3d", mode = "markers", color = ~predicted_peso) %>%
  layout(scene = list(xaxis = list(title = 'Cranio'),
                      yaxis = list(title = 'Gestazione'),
                      zaxis = list(title = 'Peso Predetto')))
plot_ly(neonati_try_2, x = ~Lunghezza, y = ~Cranio, z = ~predicted_peso, 
        type = "scatter3d", mode = "markers", 
        color = ~factor(Sesso),
        colors = c('pink', 'blue'), 
        marker = list(symbol = ~ifelse(Sesso == 0, 'circle', 'diamond'))) %>% 
  layout(scene = list(xaxis = list(title = 'Lunghezza'),
                      yaxis = list(title = 'Cranio'),
                      zaxis = list(title = 'Peso Predetto')))